Preparations

https://besjournals.onlinelibrary.wiley.com/doi/10.1111/j.1365-2656.2008.01390.x

Load the necessary libraries

library(gbm)         #for gradient boosted models
library(car)
library(dismo)
library(pdp)
library(ggfortify)
library(randomForest)
library(tidyverse)
library(gridExtra)
library(patchwork)

Scenario

Abalone are an important aquaculture shell fish that are farmed for both their meat and their shells. Abalone can live up to 50 years, although their longevity is known to be influenced by a range of environmental factors. Traditionally, abalone are aged by counting thier growth rings, however, this method is very laborious and expensive. Hence a study was conducted in which abalone growth ring counts were matched up with a range of other more easily measured physical characteristics (such as shell dimesions and weights) in order to see if any of these other parameters could be used as proxies for the number of growth rings (or age).

abalone

Format of abalone.csv data file

Read in the data

abalone = read_csv('../public/data/abalone.csv', trim_ws=TRUE)
glimpse(abalone)
## Rows: 4,177
## Columns: 10
## $ SEX          <chr> "M", "M", "F", "M", "I", "I", "F", "F", "M", "F", "F", "M…
## $ LENGTH       <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, 0…
## $ DIAMETER     <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, 0…
## $ HEIGHT       <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, 0…
## $ WHOLE_WEIGHT <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, 0…
## $ MEAT_WEIGHT  <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, 0…
## $ GUT_WEIGHT   <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, 0…
## $ SHELL_WEIGHT <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, 0…
## $ RINGS        <dbl> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 12…
## $ AGE          <dbl> 16.5, 8.5, 10.5, 11.5, 8.5, 9.5, 21.5, 17.5, 10.5, 20.5, …
abalone = abalone %>% mutate(SEX=factor(SEX))

Exploratory data analysis

Fit the model

Explore relative influence

Explore partial effects

Explore accuracy

Explore interactions

Tuning

nBoot <- 10
abalone.pred <- with(abalone,
                     expand.grid(SHELL_WEIGHT = modelr::seq_range(SHELL_WEIGHT, n = 100),
                                SEX = levels(SEX),
                                LENGTH = NA,
                                DIAMETER = NA,
                                HEIGHT = NA,
                                WHOLE_WEIGHT = NA,
                                MEAT_WEIGHT = NA,
                                GUT_WEIGHT = NA)
                     )
abalone.list <- vector('list', nBoot) 
abalone.list
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
abalone.sum <- vector('list', nBoot) 
for (i in 1:nBoot) {
    print(paste0('Boot number: ', i))
    ## Create random set
    abalone.rnd <- abalone %>%
        sample_n(size = n(), replace=TRUE)
    ## Fit the trees
    abalone.gbm = gbm(RINGS ~ SEX + LENGTH + DIAMETER + HEIGHT +
                          WHOLE_WEIGHT + MEAT_WEIGHT + GUT_WEIGHT +
                          SHELL_WEIGHT,
                      data=abalone.rnd,
                      distribution='poisson',
                      var.monotone=c(0,1,1,1,1,1,1,1),
                      n.trees=5000,
                      interaction.depth=5,
                      bag.fraction=0.5,
                      shrinkage=0.001,
                      train.fraction=1,
                      cv.folds=3)
    ## Determine the best number of trees
    (best.iter = gbm.perf(abalone.gbm,method='cv'))
    ## predict based on shell weight
    fit <- predict(abalone.gbm, newdata = abalone.pred, n.trees = best.iter) %>% exp()
    abalone.list[[i]] <- data.frame(abalone.pred, Boot = i, Fit = fit)
    ## relative influence
    abalone.sum[[i]] <- summary(abalone.gbm, n.trees = best.iter)
}
## [1] "Boot number: 1"

## [1] "Boot number: 2"

## [1] "Boot number: 3"

## [1] "Boot number: 4"

## [1] "Boot number: 5"

## [1] "Boot number: 6"

## [1] "Boot number: 7"

## [1] "Boot number: 8"

## [1] "Boot number: 9"

## [1] "Boot number: 10"

abalone.fit <- do.call('rbind', abalone.list)
abalone.fit <- abalone.fit %>%
    group_by(SHELL_WEIGHT, SEX) %>%
    ## summarise(Median = median(Fit),
    ##           Lower = quantile(Fit, p=0.025),
    ##           Upper = quantile(Fit, p=0.975))
    ggdist::median_hdci(Fit)       
g1 <- abalone.fit %>% ggplot(aes(y=Fit, x=SHELL_WEIGHT, fill=SEX, color=SEX)) +
    geom_ribbon(aes(ymin=.lower, ymax=.upper), alpha=0.3, color=NA) +
    geom_line() +
    scale_fill_viridis_d() +
    scale_colour_viridis_d() +
    theme_classic()

abalone.inf <- do.call('rbind', abalone.sum)
abalone.inf <- abalone.inf %>%
    group_by(var) %>%
    ggdist::median_hdci(rel.inf)       

g2 <- abalone.inf %>% ggplot(aes(y=var, x=rel.inf)) +
    geom_vline(xintercept=12.5, linetype='dashed') +
    geom_pointrange(aes(xmin=.lower, xmax=.upper)) +
    theme_classic()

g2 + patchwork::inset_element(g1, left=0.5, bottom=0.01, right=1, top=0.7)

g1 + patchwork::inset_element(g2, left=0.5, bottom=0.01, right=1, top=0.5)

Random Forest

library(randomForest)
abalone.rf = randomForest(RINGS ~ SEX + LENGTH + DIAMETER + HEIGHT +
                      WHOLE_WEIGHT + MEAT_WEIGHT + GUT_WEIGHT + SHELL_WEIGHT,
                      data=abalone, importance=TRUE,
                      ntree=1000)
abalone.imp = randomForest::importance(abalone.rf)
## Rank by either:
## *MSE (mean decrease in accuracy)
## For each tree, calculate OOB prediction error.
## This also done after permuting predictors.
## Then average diff of prediction errors for each tree
## *NodePurity (mean decrease in node impurity)
## Measure of the total decline of impurity due to each
## predictor averaged over trees
100*abalone.imp/sum(abalone.imp)
##                 %IncMSE IncNodePurity
## SEX          0.12480334      3.007474
## LENGTH       0.06201705      8.715762
## DIAMETER     0.06745577     10.497484
## HEIGHT       0.08890620     13.050985
## WHOLE_WEIGHT 0.06848899     14.899248
## MEAT_WEIGHT  0.13732241     13.552916
## GUT_WEIGHT   0.07246091     12.110066
## SHELL_WEIGHT 0.10750320     23.437107
varImpPlot(abalone.rf)

## use brute force
abalone.rf %>% pdp::partial('SHELL_WEIGHT') %>% autoplot